Clear out your workspace

# NOTE(review): clearing the workspace with rm(list = ls()) inside a script is
# discouraged — it does not unload packages or reset options(); prefer
# restarting the R session. all=FALSE is the default (hidden objects are kept).
rm(list=ls(all=FALSE))
# Confirm the workspace is now empty
ls()
## character(0)

Load packages and a helper function

library(car)
library(dplyr)
library(ggplot2)
library(openxlsx)
library(tidyverse)
library(data.table)
library(EnvStats)
# Column-wise NaN test for a data frame: applies is.nan() to every column and
# binds the results into a logical matrix with the same dimensions as the
# input (base is.nan() has no data.frame method). Used later to convert NaN
# cells — produced by mean()/sd() over all-NA groups — back to NA.
is.nan.data.frame <- function(frame) {
  nan_by_column <- lapply(frame, is.nan)
  do.call(cbind, nan_by_column)
}

Import data

Data are stored in the tab “HA - per well” in the excel file “Data_paper” in Sandra’s OneDrive: https://panthers-my.sharepoint.com/personal/mclellan_uwm_edu/Documents/Normalization paper Sept 2020/.

# Read the tab-delimited export of the "HA - per well" sheet.
# NOTE(review): read.table's default na.strings = "NA" converts literal "NA"
# fields into real NA values — the NA filtering later in this script relies on
# that. h=T is partial matching for header = TRUE; spelling it out (and using
# TRUE rather than T) would be clearer.
data<-read.table("~/OneDrive - UWM/Normalization paper Sept 2020/Data/Data_paper_HA.txt", sep = "\t", h=T)
# Sanity check: one row per ddPCR well, with plant metadata, N1/N2 averages,
# LOD/LOQ, recovery controls (BCoV, PMMoV, HF183) and detection-level calls
# ("Quantifiable", "BLQ" = below LOQ, "BLD" = below LOD).
head(data)
##    CV Daily.Avg.Flow..MGD. TSS..mg.L.  pH Total.P..mg.L. BOD..mg.L. Temp...C.
## 1 322                 9.73        144 7.7           2.90        134      20.6
## 2 322                 9.73        144 7.7           2.90        134      20.6
## 3 322                 9.73        144 7.7           2.90        134      20.6
## 4 323                 9.11        184 7.7           3.71        157      21.7
## 5 323                 9.11        184 7.7           3.71        157      21.7
## 6 350                 7.82        154 7.7           4.18        174       9.0
##   Conductivity..uS.cm.       City                    Sample_ID
## 1                   NA Brookfield BRK  9/14-1 HA frozen filter
## 2                   NA Brookfield BRK  9/14-1 HA frozen filter
## 3                   NA Brookfield BRK  9/14-1 HA frozen filter
## 4                   NA Brookfield BRK  9/15-1 HA frozen filter
## 5                   NA Brookfield BRK  9/15-1 HA frozen filter
## 6                   NA Brookfield BRK  9/21-1 HA frozen filter
##                Run_N1N2 Well Collection_Date_Start   N1_avg   N1_LOD   N1_LOQ
## 1 9-20 N1N2 BCOV (HEPG)  E05            09/13/2020  1315.20 2121.027 7166.087
## 2 9-20 N1N2 BCOV (HEPG)  E06            09/13/2020   648.96 2121.027 7166.087
## 3             9-22 BCOV  F08            09/13/2020       NA       NA       NA
## 4 9-20 N1N2 BCOV (HEPG)  G05            09/14/2020  3206.40 2121.027 7166.087
## 5 9-20 N1N2 BCOV (HEPG)  G06            09/14/2020  4166.40 2121.027 7166.087
## 6    9-24 N1N2 mad stnd  C05            09/20/2020 10051.20 2121.027 7166.087
##    N2_avg   N2_LOD   N2_LOQ     BCoV PMMoV HF183 N1_LevelDetection
## 1 1315.20 3569.441 6736.762       NA    NA    NA               BLD
## 2  648.96 3569.441 6736.762       NA    NA    NA               BLD
## 3      NA       NA       NA 1.588519    NA    NA              <NA>
## 4 1920.00 3569.441 6736.762       NA    NA    NA               BLQ
## 5 1190.40 3569.441 6736.762       NA    NA    NA               BLQ
## 6 4694.40 3569.441 6736.762       NA    NA    NA      Quantifiable
##   N2_LevelDetection N1N2_avg N1_positive N2_positive    ddPCR_run_info
## 1               BLD  1315.20           2           2 before_11-20-2020
## 2               BLD   648.96           1           1 before_11-20-2020
## 3              <NA>       NA           0           0 before_11-20-2020
## 4               BLD  2563.20           5           3 before_11-20-2020
## 5               BLD  2678.40           7           2 before_11-20-2020
## 6               BLQ  7372.80          15           7 before_11-20-2020
### Count the number of N1/N2 assay replicates per filter (Sample_ID).
# Rows with a missing N1_avg are excluded with !is.na(): the original
# `N1_avg != "NA"` coerced the numeric column to character and only worked
# because NA comparisons yield NA, which subset() silently drops.
N1N2.assay.count<-aggregate(N1_avg ~ Sample_ID, subset(data, !is.na(N1_avg)), FUN = length)
names(N1N2.assay.count)[2]<-"N1N2_n_assay_per_filter"
# data.table left join (setDT(x)[y, on=...] keeps every row of y): attach the
# per-filter assay count back onto the well-level table.
data<-as.data.frame(setDT(N1N2.assay.count)[data, on="Sample_ID"])

### Count the number of filter replicates per sample (CV):
# first collapse to one row per CV x Sample_ID, then count filters per CV.
filter.count<-aggregate(N1_avg ~ CV + Sample_ID, subset(data, !is.na(N1_avg)), FUN = length)
filter.count<-aggregate(N1_avg ~ CV, filter.count, FUN = length)
names(filter.count)[2]<-"n_filter"
data<-as.data.frame(setDT(filter.count)[data, on="CV"])

Data per well

N1 vs N2

# Restrict to wells where BOTH N1 and N2 were above their LOQ, so the two
# assays can be compared head-to-head.
data.quantifiable<-subset(data, N1_LevelDetection=="Quantifiable" & N2_LevelDetection=="Quantifiable")

# All data (N2-single quenched probe and N2-double quenched probe) 
nrow(data.quantifiable)
## [1] 410
cor.test(log10(data.quantifiable$N1_avg), log10(data.quantifiable$N2_avg), method = c("spearman"))
## Warning in cor.test.default(log10(data.quantifiable$N1_avg),
## log10(data.quantifiable$N2_avg), : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  log10(data.quantifiable$N1_avg) and log10(data.quantifiable$N2_avg)
## S = 1596145, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8610449
# Mean and SD of the per-well log10(N1)/log10(N2) ratio
mean(log10(data.quantifiable$N1_avg)/log10(data.quantifiable$N2_avg))
## [1] 1.055377
sd(log10(data.quantifiable$N1_avg)/log10(data.quantifiable$N2_avg))
## [1] 0.03752946
# NOTE(review): Spearman's rho is rank-based, so this untransformed test is
# mathematically identical to the log10 version above (same S, same rho) —
# one of the two calls is redundant.
cor.test(data.quantifiable$N1_avg, (data.quantifiable$N2_avg), method = c("spearman"))
## Warning in cor.test.default(data.quantifiable$N1_avg,
## (data.quantifiable$N2_avg), : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  data.quantifiable$N1_avg and (data.quantifiable$N2_avg)
## S = 1596145, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8610449
# Mean and SD of the raw N1/N2 concentration ratio
mean((data.quantifiable$N1_avg)/(data.quantifiable$N2_avg))
## [1] 1.832281
sd((data.quantifiable$N1_avg)/(data.quantifiable$N2_avg))
## [1] 0.7192024
# N1 + N2-single quenched probe
data.N2singlequenched<-subset(data.quantifiable, ddPCR_run_info == "before_11-20-2020")
nrow(data.N2singlequenched)
## [1] 323
mean(log10(data.N2singlequenched$N1_avg)/log10(data.N2singlequenched$N2_avg))
## [1] 1.054348
sd(log10(data.N2singlequenched$N1_avg)/log10(data.N2singlequenched$N2_avg))
## [1] 0.03596731
mean((data.N2singlequenched$N1_avg)/(data.N2singlequenched$N2_avg))
## [1] 1.794686
sd((data.N2singlequenched$N1_avg)/(data.N2singlequenched$N2_avg))
## [1] 0.6610602
# N1 + N2-double quenched probe
data.N2doublequenched<-subset(data.quantifiable, ddPCR_run_info == "after_11-20-2020")
nrow(data.N2doublequenched)
## [1] 87
mean(log10(data.N2doublequenched$N1_avg)/log10(data.N2doublequenched$N2_avg))
## [1] 1.059196
sd(log10(data.N2doublequenched$N1_avg)/log10(data.N2doublequenched$N2_avg))
## [1] 0.04284842
mean((data.N2doublequenched$N1_avg)/(data.N2doublequenched$N2_avg))
## [1] 1.971859
sd((data.N2doublequenched$N1_avg)/(data.N2doublequenched$N2_avg))
## [1] 0.8937146

How many times do the N1 and N2 assays disagree on the level of detection?

#### ALL DATES ####
#... N1 > LOQ & N2 < LOQ
comparison.1<-subset(data, N1_LevelDetection=="Quantifiable" & N2_LevelDetection!="Quantifiable")
nrow(comparison.1)
## [1] 137
#... N1 < LOQ & N2 > LOQ
comparison.2<-subset(data, N1_LevelDetection!="Quantifiable" & N2_LevelDetection=="Quantifiable")
nrow(comparison.2)
## [1] 19
# All N1N2 data: wells where both detection levels are recorded. !is.na()
# replaces the original `!= "NA"` string test — the column holds real NA
# values (read.table's na.strings), which only matched before because NA
# comparisons are silently dropped by subset().
comparison.3<-subset(data, !is.na(N1_LevelDetection) & !is.na(N2_LevelDetection))
nrow(comparison.3)
## [1] 820
#### BEFORE 11-20-2020 #### 
# Same three LOQ comparisons, restricted to runs using the single-quenched
# N2 probe (comparison.1/2/3 are defined in the ALL DATES block above).
#... N1 > LOQ & N2 < LOQ
comparison.1.before<-subset(comparison.1, ddPCR_run_info=="before_11-20-2020")
nrow(comparison.1.before)
## [1] 113
#... N1 < LOQ & N2 > LOQ
comparison.2.before<-subset(comparison.2, ddPCR_run_info=="before_11-20-2020")
nrow(comparison.2.before)
## [1] 19
#All N1N2 data
comparison.3.before<-subset(comparison.3, ddPCR_run_info=="before_11-20-2020")
nrow(comparison.3.before)
## [1] 692
#### AFTER 11-20-2020 #### 
# And restricted to runs using the double-quenched N2 probe.
#... N1 > LOQ & N2 < LOQ
comparison.1.after<-subset(comparison.1, ddPCR_run_info=="after_11-20-2020")
nrow(comparison.1.after)
## [1] 24
#... N1 < LOQ & N2 > LOQ
comparison.2.after<-subset(comparison.2, ddPCR_run_info=="after_11-20-2020")
nrow(comparison.2.after)
## [1] 0
#All N1N2 data
comparison.3.after<-subset(comparison.3, ddPCR_run_info=="after_11-20-2020")
nrow(comparison.3.after)
## [1] 128
#### ALL DATES ####
# Same exercise at the LOD level; the comparison.* variables from the LOQ
# section above are intentionally overwritten here.
#... N1 detected (> LOD) & N2 not detected (< LOD)
# (comment corrected: the code tests N2_LevelDetection == "BLD", i.e. < LOD,
# not < LOQ as the original label claimed)
comparison.1<-subset(data, N1_LevelDetection !="BLD" & N2_LevelDetection=="BLD")
nrow(comparison.1)
## [1] 139
#... N1 < LOD & N2 > LOD
comparison.2<-subset(data, N1_LevelDetection=="BLD" & N2_LevelDetection!="BLD")
nrow(comparison.2)
## [1] 7
# All N1N2 data (both detection levels recorded): !is.na() replaces the
# fragile `!= "NA"` string comparison on a column that holds real NA values.
comparison.3<-subset(data, !is.na(N1_LevelDetection) & !is.na(N2_LevelDetection))
nrow(comparison.3)
## [1] 820
#### BEFORE 11-20-2020 #### 
#... N1 > LOD & N2 < LOD (comment corrected: the code feeding comparison.1
# tests N2_LevelDetection == "BLD", i.e. below LOD, not below LOQ)
comparison.1.before<-subset(comparison.1, ddPCR_run_info=="before_11-20-2020")
nrow(comparison.1.before)
## [1] 135
#... N1 < LOD & N2 > LOD
comparison.2.before<-subset(comparison.2, ddPCR_run_info=="before_11-20-2020")
nrow(comparison.2.before)
## [1] 6
#All N1N2 data
comparison.3.before<-subset(comparison.3, ddPCR_run_info=="before_11-20-2020")
nrow(comparison.3.before)
## [1] 692
#### AFTER 11-20-2020 #### 
#... N1 > LOD & N2 < LOD (comment corrected, as above)
comparison.1.after<-subset(comparison.1, ddPCR_run_info=="after_11-20-2020")
nrow(comparison.1.after)
## [1] 4
#... N1 < LOD & N2 > LOD
comparison.2.after<-subset(comparison.2, ddPCR_run_info=="after_11-20-2020")
nrow(comparison.2.after)
## [1] 1
#All N1N2 data
comparison.3.after<-subset(comparison.3, ddPCR_run_info=="after_11-20-2020")
nrow(comparison.3.after)
## [1] 128

Plot N2 vs N1

# Scatter of per-assay N1 vs N2 concentration (both log10), one point per
# quantifiable assay, coloured by plant. The black line is the 1:1 identity
# (perfect N1 = N2 agreement); the red line is the fitted linear trend.
# xlim()/ylim() are ggplot2 shorthand for scale_*_continuous(limits = ...).
ggplot(data.quantifiable, aes(x = log10(N2_avg), y = log10(N1_avg))) +
  geom_point(aes(color = City), size = 0.5) +
  theme_classic() +
  xlim(2.5, 5.5) +
  ylim(2.5, 5.5) +
  labs(y = "log10 - N1 cp/L",
       x = "log10 - N2 cp/L",
       title = "1 point = one assay when quantifiable") +
  geom_abline(intercept = 0, slope = 1) +
  stat_smooth(method = "lm", col = "red", size = 0.5)
## `geom_smooth()` using formula 'y ~ x'

Data averaged per filter (assay variability)

# Average the well-level data down to one row per filter (Sample_ID),
# carrying City / CV / collection date along as grouping metadata.
data.per.filter<-data %>%
  group_by(Sample_ID, City, CV, Collection_Date_Start) %>%
  # Detection levels are scored so they can be summed per filter:
  # Quantifiable = 100, BLQ = 1, BLD (or anything else) = 0. Together with
  # N1.count, the summed score lets the filter-level call be re-derived below.
  summarize(N1.avg = mean(N1_avg, na.rm=TRUE),
            N1.rsd = sd(N1_avg, na.rm=TRUE)/mean(N1_avg, na.rm=TRUE),
            N1.LevelDetection = sum(ifelse(N1_LevelDetection=="Quantifiable", 100, ifelse(N1_LevelDetection=="BLQ", 1,0)), na.rm=TRUE),
            N1.count = sum(!is.na(N1_LevelDetection)),

            BCoV.avg_recovery_rate = mean(BCoV, na.rm=TRUE),
            PMMoV.avg = mean(PMMoV, na.rm=TRUE),
            HF183.avg = mean(HF183, na.rm=TRUE),
            TSS.avg = mean(as.numeric(as.character(TSS..mg.L.)),na.rm = TRUE),
            Flow.avg = mean(as.numeric(as.character(Daily.Avg.Flow..MGD.)),na.rm = TRUE),
            .groups = 'drop')

data.per.filter<-as.data.frame(data.per.filter)
# mean(..., na.rm = TRUE) over an all-NA group returns NaN; convert those
# NaN cells back to NA so downstream is.na()/na.rm logic treats them as missing.
data.per.filter.NaN<-is.nan.data.frame(data.per.filter)
data.per.filter[data.per.filter.NaN]<-NA

# sub() only substitutes the first match, so applying it twice replaces the
# first TWO "." in each column name (e.g. "N1.avg" -> "N1_avg").
names(data.per.filter)<-sub("[.]", "_", names(data.per.filter)); names(data.per.filter)<-sub("[.]", "_", names(data.per.filter)) # replace "." by "_"in header

# Recompute the level of detection
# "Lowest": quantifiable only if EVERY assay on the filter was quantifiable
# (summed score / assay count == 100); any BLQ or BLD assay downgrades it.
data.per.filter$N1_LowestLevelDetection<-ifelse(data.per.filter$N1_LevelDetection/data.per.filter$N1_count==100, "Quantifiable", ifelse(data.per.filter$N1_LevelDetection/data.per.filter$N1_count>=1, "BLQ", "BLD"))
# "Highest": one quantifiable assay (score >= 100) is enough. This line
# overwrites the numeric score with a string, so it must run AFTER the
# "lowest" computation above.
data.per.filter$N1_LevelDetection<-ifelse(data.per.filter$N1_LevelDetection>=100, "Quantifiable", ifelse(data.per.filter$N1_LevelDetection>0, "BLQ", "BLD"))

Data averaged per day (filter + assay variability)

# Average the per-filter data to one row per sample day (CV). The day-level
# RSDs computed here capture filter + assay variability combined.
data.per.day<-data.per.filter %>%
  group_by(CV, City,Collection_Date_Start) %>%
  # Same 100/1/0 scoring scheme as the per-filter step, applied to both the
  # per-filter "highest" and "lowest" detection calls.
  summarize(N1.avg.day = mean(N1_avg, na.rm=TRUE), 
            N1.LevelDetection.day = sum(ifelse(N1_LevelDetection=="Quantifiable", 100, ifelse(N1_LevelDetection=="BLQ", 1,0)), na.rm=TRUE),
            N1.LowestLevelDetection.day = sum(ifelse(N1_LowestLevelDetection=="Quantifiable", 100, ifelse(N1_LowestLevelDetection=="BLQ", 1,0)), na.rm=TRUE),
            N1.count.day = sum(!is.na(N1_LevelDetection)),
            N1.rsd.day = sd(N1_avg, na.rm=TRUE)/mean(N1_avg, na.rm=TRUE),

            BCoV.avg.day = mean(BCoV_avg_recovery_rate, na.rm=TRUE),
            BCoV.rsd.day = sd(BCoV_avg_recovery_rate, na.rm=TRUE)/mean(BCoV_avg_recovery_rate, na.rm=TRUE),            

            PMMoV.avg.day = mean(PMMoV_avg, na.rm=TRUE),
            PMMoV.rsd.day = sd(PMMoV_avg, na.rm=TRUE)/mean(PMMoV_avg, na.rm=TRUE),

            HF183.avg.day = mean(HF183_avg, na.rm=TRUE),
            HF183.rsd.day = sd(HF183_avg, na.rm=TRUE)/mean(HF183_avg, na.rm=TRUE),
    
            TSS.avg.day = mean(as.numeric(as.character(TSS_avg)),na.rm = TRUE),
            Flow.avg.day = mean(as.numeric(as.character(Flow_avg)),na.rm = TRUE),
            .groups = 'drop')

data.per.day<-as.data.frame(data.per.day)
# Convert NaN (mean/sd over all-NA groups) back to NA, as in the filter step.
data.per.day.NaN<-is.nan.data.frame(data.per.day)
data.per.day[data.per.day.NaN]<-NA
# Two sub() passes replace the first two "." in each column name with "_".
names(data.per.day)<-sub("[.]", "_", names(data.per.day)); names(data.per.day)<-sub("[.]", "_", names(data.per.day))

# Re-derive the day-level detection calls from the summed scores; "lowest"
# requires every contributing filter to be quantifiable, and must be computed
# before N1_LevelDetection_day is overwritten with strings on the next line.
data.per.day$N1_LowestLevelDetection_day<-ifelse(data.per.day$N1_LowestLevelDetection_day/data.per.day$N1_count_day==100, "Quantifiable", ifelse(data.per.day$N1_LowestLevelDetection_day/data.per.day$N1_count_day>=1, "BLQ", "BLD"))
data.per.day$N1_LevelDetection_day<-ifelse(data.per.day$N1_LevelDetection_day>=100, "Quantifiable", ifelse(data.per.day$N1_LevelDetection_day>0, "BLQ", "BLD"))

Recovery controls overview in all available sites and dates data

BCoV

BCoV vs WWTPs

# All data
# Overall BCoV spike recovery (%) across every site-day.
mean(data.per.day$BCoV_avg_day, na.rm = TRUE)
## [1] 4.937945
sd(data.per.day$BCoV_avg_day, na.rm = TRUE)
## [1] 4.232944
# ANOVA BCoV recovery vs WWTPs

    ## ANOVA on log10 recovery: does mean recovery differ between plants?
mod1 <- aov(log10(BCoV_avg_day) ~ City, data = data.per.day)
summary(mod1)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## City         11  26.52  2.4107   26.91 <2e-16 ***
## Residuals   405  36.28  0.0896                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
# Base-R diagnostic plots (residuals vs fitted, QQ, scale-location, leverage)
plot(mod1)

    ## Test post-hoc
# Tukey HSD on the City factor; multcompLetters converts the pairwise
# adjusted p-values into compact letter groupings (plants sharing a letter
# are not significantly different from each other).
post.hoc1<-TukeyHSD(mod1, ordered = TRUE, conf.level = 0.95)
label1<-multcompView::multcompLetters(post.hoc1$City[,"p adj"])

    ## Plot
label1.df<-as.data.frame(label1$Letters)
names(label1.df)<-"Letter"
label1.df$City<-row.names(label1.df)
# data.table left join: attach each City's letter to every site-day row.
data.per.day.bcov.plot<-setDT(label1.df)[data.per.day, on="City"]
# Jittered points + transparent boxplots per plant; geom_text prints the
# Tukey letters at a fixed height (y = 40) above each group.
ggplot(data.per.day.bcov.plot, aes(x=City, y=BCoV_avg_day)) +
  geom_jitter(width=0.2) + 
  geom_boxplot(outlier.colour=NA, fill=NA, colour="grey20") +
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(y = "BCoV recovery (%)", 
       x = "",
       title = "") +
    geom_text(aes(x = City, y = 40, label = Letter))
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).

    ## More specific info 
# Recovery range and sample size at two contrasting plants: the lowest
# recoveries (Waukesha) vs a wide-range plant (Milwaukee Jones Island).
BCOV.Waukesha<-subset(data.per.day.bcov.plot, City == "Waukesha")
min(BCOV.Waukesha$BCoV_avg_day)
## [1] 0.2123006
max(BCOV.Waukesha$BCoV_avg_day)
## [1] 2.984008
nrow(BCOV.Waukesha)
## [1] 41
BCOV.Milwaukee_JI<-subset(data.per.day.bcov.plot, City == "Milwaukee_JI")
min(BCOV.Milwaukee_JI$BCoV_avg_day)
## [1] 0.892347
max(BCOV.Milwaukee_JI$BCoV_avg_day)
## [1] 27.95129
nrow(BCOV.Milwaukee_JI)
## [1] 51

BCoV vs TSS

# Pull per-day City / TSS / BCoV recovery for the correlation test.
cor.data0<-data.per.day[,c("City", "TSS_avg_day", "BCoV_avg_day")]
cor.data0[,2]<-as.numeric(as.character(cor.data0[,2]))
cor.data0[,3]<-as.numeric(as.character(cor.data0[,3]))
# Keep rows with a known City and TSS. !is.na() replaces the original
# `!= "NA"` string comparison: these columns contain real NA values, not the
# text "NA", so the string test only worked because NA comparisons are
# silently dropped by subset(). NOTE(review): BCoV (column 3) is not filtered
# here — confirm the kept rows all have a BCoV value.
cor.data0<-subset(cor.data0, !is.na(cor.data0[,1]) & !is.na(cor.data0[,2]))
nrow(cor.data0)
## [1] 381
# Spearman correlation: BCoV recovery declines modestly with TSS (rho ~ -0.25).
cor.test(cor.data0[,2], cor.data0[,3], method = "spearman")
## Warning in cor.test.default(cor.data0[, 2], cor.data0[, 3], method =
## "spearman"): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  cor.data0[, 2] and cor.data0[, 3]
## S = 11538573, p-value = 6.39e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.2517899
# NOTE(review): aes(colour = City) would give the legend a clean title
# instead of "cor.data0[, 1]".
ggplot(cor.data0, aes(x=TSS_avg_day, y=BCoV_avg_day, colour=cor.data0[,1])) +
  geom_point()

PMMoV

# Overall PMMoV concentration (cp/L) across all site-days: arithmetic stats.
mean(data.per.day$PMMoV_avg_day, na.rm = TRUE)
## [1] 23867825
sd(data.per.day$PMMoV_avg_day, na.rm = TRUE)
## [1] 44003926
# Coefficient of variation (RSD)
sd(data.per.day$PMMoV_avg_day, na.rm=TRUE)/mean(data.per.day$PMMoV_avg_day, na.rm=TRUE)
## [1] 1.84365
#geomean & geoSD for PMMoV
#(comment corrected: the arithmetic values above are in RAW space; the
# gmmean/gmsd computed below are the same statistics in log10 space)
geoMean(data.per.day$PMMoV_avg_day, na.rm = TRUE)
## [1] 17087021
geoSD(data.per.day$PMMoV_avg_day, na.rm = TRUE)
## [1] 2.043773
temp<-data.frame(PMMoV_avg_day=data.per.day$PMMoV_avg_day)
temp$logpmmov<-log10(temp$PMMoV_avg_day)
# Cross-check: 10^7.2327 ~= 1.709e7, matching geoMean() above.
gmmean<-mean(temp$logpmmov,na.rm=TRUE); gmmean
## [1] 7.232666
gmsd<-sd(temp$logpmmov,na.rm=TRUE); gmsd
## [1] 0.3104326

PMMoV vs WWTPs

# ANOVA: PMMoV concentration (log10) vs WWTPs
# (header corrected — this models concentration, not "recovery")
mod2 <- aov(log10(PMMoV_avg_day) ~ City, data = data.per.day)
summary(mod2)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## City         11  9.412  0.8556   11.76 <2e-16 ***
## Residuals   354 25.763  0.0728                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 52 observations deleted due to missingness
# ANOVA diagnostic plots
plot(mod2)

    ## Test post-hoc: Tukey HSD + compact letter display, as for BCoV.
post.hoc2<-TukeyHSD(mod2, ordered = TRUE, conf.level = 0.95)
label2<-multcompView::multcompLetters(post.hoc2$City[,"p adj"])

    ## Plot
label2.df<-as.data.frame(label2$Letters)
names(label2.df)<-"Letter"
label2.df$City<-row.names(label2.df)
# Renamed from data.per.day.bcov.plot: this object holds the PMMoV letter
# groupings, not BCoV — the recycled name was misleading. It is only used
# by the ggplot call directly below.
data.per.day.pmmov.plot<-setDT(label2.df)[data.per.day, on="City"]
ggplot(data.per.day.pmmov.plot, aes(x=City, y=log10(PMMoV_avg_day))) +
  geom_jitter(width=0.2) + 
  geom_boxplot(outlier.colour=NA, fill=NA, colour="grey20") +
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(y = "PMMoV concentration (log10 cp/L)", 
       x = "",
       title = "") +
    geom_text(aes(x = City, y = 9.2, label = Letter))
## Warning: Removed 52 rows containing non-finite values (stat_boxplot).
## Warning: Removed 52 rows containing missing values (geom_point).

PMMoV_avg vs Flow

# Extract data
cor.pmmov<-data.per.day[,c("City", "PMMoV_avg_day", "Flow_avg_day")]
cor.pmmov[,2]<-log10(as.numeric(as.character(cor.pmmov[,2])))
cor.pmmov[,3]<-log10(as.numeric(as.character(cor.pmmov[,3])))

# Compute correlation per plants
cor.PMMoV.results <- cor.pmmov %>%
  group_by(City) %>%
  summarize(rho=cor(PMMoV_avg_day, Flow_avg_day, use="complete.obs", method = "spearman"), 
            n = sum(!is.na(PMMoV_avg_day) & !is.na(Flow_avg_day)),
            .groups = 'drop')
cor.PMMoV.results$rho.abs<-abs(cor.PMMoV.results$rho)
min(cor.PMMoV.results$rho.abs); max(cor.PMMoV.results$rho.abs)
## [1] 0.1074237
## [1] 0.688731
mean(cor.PMMoV.results$rho.abs); median(cor.PMMoV.results$rho.abs)
## [1] 0.3077994
## [1] 0.2879652
# Prepare data for plot
cor.pmmov.cor<-as.data.frame(setDT(cor.PMMoV.results)[cor.pmmov, on="City"])
cor.pmmov.cor$WWTPs<-paste0(cor.pmmov.cor$City, " (", round(cor.pmmov.cor$rho, digits = 2), ")")

# Plot
ggplot(cor.pmmov.cor, aes(x=Flow_avg_day, y=PMMoV_avg_day, colour=WWTPs)) +
  geom_point() + 
  labs(x = "Average daily flow - log10(MDG)", 
       y = "PMMoV - log10(cp/L)") +
  theme_bw()
## Warning: Removed 55 rows containing missing values (geom_point).

PMMoV-RSD (averaged per plant) vs Flow (averaged per plant)

# Select data
cor.pmmov.rsd<-data.per.day[,c("City", "PMMoV_avg_day", "Flow_avg_day")]
cor.pmmov.rsd<-subset(cor.pmmov.rsd, PMMoV_avg_day!="NA" & Flow_avg_day!="NA")
cor.pmmov.rsd[,2]<-log10(as.numeric(as.character(cor.pmmov.rsd[,2])))
cor.pmmov.rsd[,3]<-as.numeric(as.character(cor.pmmov.rsd[,3]))

#Compute means per plant
cor.pmmov.rsd.results<-cor.pmmov.rsd %>%
  group_by(City) %>%
  summarize(PMMoV.rsd.mean = (sd(PMMoV_avg_day, na.rm=TRUE)/mean(PMMoV_avg_day, na.rm=TRUE))*100, 
            Flow.mean = mean(Flow_avg_day, na.rm=TRUE), 
            n = sum(!is.na(City)),
            .groups = 'drop')
#plot(PMMoV_rsd_day~Flow_avg_day, data=subse

# Correlation
cor.test(cor.pmmov.rsd.results$PMMoV.rsd.mean, cor.pmmov.rsd.results$Flow.mean, method = "spearman", na.action=na.omit)
## 
##  Spearman's rank correlation rho
## 
## data:  cor.pmmov.rsd.results$PMMoV.rsd.mean and cor.pmmov.rsd.results$Flow.mean
## S = 354, p-value = 0.4571
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.2377622
# Prepare data for plot
cor.pmmov.rsd.results$WWTPs<-paste0(cor.pmmov.rsd.results$City, " (", round(cor.pmmov.rsd.results$n, digits = 2), ")")

# Plot
ggplot(cor.pmmov.rsd.results, aes(x=Flow.mean, y=PMMoV.rsd.mean, colour=WWTPs)) +
    geom_point() + 
  labs(x = "Averaged daily flow per WWTP - (MDG)", 
       y = "Averaged PMMoV RSD per WWTP (%)") +
  theme_bw()

#ggplot(cor.pmmov.rsd.results, aes(x=log10(Flow.mean), y=log10(PMMoV.rsd.mean*100), colour=City)) +
#    geom_point() + 
#  labs(x = "Averaged daily flow per WWTP - log10(MDG)", 
#       y = "PMMoV RSD log10(%)") +
#  theme_bw()

HF183

# Overall HF183 concentration (cp/L) across all site-days: arithmetic stats.
mean(data.per.day$HF183_avg_day, na.rm = TRUE)
## [1] 700853478
sd(data.per.day$HF183_avg_day, na.rm = TRUE)
## [1] 374103910
# Coefficient of variation (RSD)
sd(data.per.day$HF183_avg_day, na.rm=TRUE)/mean(data.per.day$HF183_avg_day, na.rm=TRUE)
## [1] 0.5337833
#geomean & geoSD for HF183 (comment corrected: this section is HF183, the
# original said "PMMoV"; the arithmetic values above are raw-space, and the
# gmmean/gmsd below repeat them in log10 space)
geoMean(data.per.day$HF183_avg_day, na.rm = TRUE)
## [1] 612436010
geoSD(data.per.day$HF183_avg_day, na.rm = TRUE)
## [1] 1.720642
temp<-data.frame(HF183_avg_day=data.per.day$HF183_avg_day)
temp$loghf183<-log10(temp$HF183_avg_day)
gmmean<-mean(temp$loghf183,na.rm=TRUE); gmmean
## [1] 8.787061
gmsd<-sd(temp$loghf183,na.rm=TRUE); gmsd
## [1] 0.2356906

HF183 vs WWTPs

# ANOVA: HF183 concentration (log10) vs WWTPs
# (comment corrected — the original header said "PMMoV recovery")
mod3 <- aov(log10(HF183_avg_day) ~ City, data = data.per.day)
summary(mod3)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## City         11  6.548  0.5952   17.07 <2e-16 ***
## Residuals   287 10.006  0.0349                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 119 observations deleted due to missingness
# ANOVA diagnostic plots (no post-hoc letter plot for HF183)
plot(mod3)

HF183_avg vs Flow

# Extract data
cor.hf183<-data.per.day[,c("City", "HF183_avg_day", "Flow_avg_day")]
cor.hf183[,2]<-log10(as.numeric(as.character(cor.hf183[,2])))
cor.hf183[,3]<-log10(as.numeric(as.character(cor.hf183[,3])))

# Compute correlation per plants
cor.HF183.results <- cor.hf183 %>%
  group_by(City) %>%
  summarize(rho=cor(HF183_avg_day, Flow_avg_day, use="complete.obs", method = "spearman"), 
            n = sum(!is.na(HF183_avg_day) & !is.na(Flow_avg_day)),
            .groups = 'drop')
cor.HF183.results$rho.abs<-abs(cor.HF183.results$rho)
min(cor.HF183.results$rho.abs); max(cor.HF183.results$rho.abs)
## [1] 0.06593407
## [1] 0.6560381
mean(cor.HF183.results$rho.abs); median(cor.HF183.results$rho.abs)
## [1] 0.3476914
## [1] 0.288188
# Prepare data for plot
cor.hf183.cor<-as.data.frame(setDT(cor.HF183.results)[cor.hf183, on="City"])
cor.hf183.cor$WWTPs<-paste0(cor.hf183.cor$City, " (", round(cor.hf183.cor$rho, digits = 2), ")")

# Plot
ggplot(cor.hf183.cor, aes(x=Flow_avg_day, y=HF183_avg_day, colour=WWTPs)) +
  geom_point() + 
  labs(x = "Average daily flow - log10(MDG)", 
       y = "HF183 - log10(cp/L)") +
  theme_bw()
## Warning: Removed 122 rows containing missing values (geom_point).

HF183-RSD (averaged per plant) vs Flow (averaged per plant)

# Select data
cor.hf183.rsd<-data.per.day[,c("City", "HF183_avg_day", "Flow_avg_day")]
cor.hf183.rsd<-subset(cor.hf183.rsd, HF183_avg_day!="NA" & Flow_avg_day!="NA")
cor.hf183.rsd[,2]<-log10(as.numeric(as.character(cor.hf183.rsd[,2])))
cor.hf183.rsd[,3]<-as.numeric(as.character(cor.hf183.rsd[,3]))

#Compute means per plant
cor.hf183.rsd.results<-cor.hf183.rsd %>%
  group_by(City) %>%
  summarize(HF183.rsd.mean = (sd(HF183_avg_day, na.rm=TRUE)/mean(HF183_avg_day, na.rm=TRUE))*100, 
            Flow.mean = mean(Flow_avg_day, na.rm=TRUE), 
            n = sum(!is.na(City)),
            .groups = 'drop')

# Correlation
cor.test(cor.hf183.rsd.results$HF183.rsd.mean, cor.hf183.rsd.results$Flow.mean, method = "spearman", na.action=na.omit)
## 
##  Spearman's rank correlation rho
## 
## data:  cor.hf183.rsd.results$HF183.rsd.mean and cor.hf183.rsd.results$Flow.mean
## S = 438, p-value = 0.0793
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.5314685
# Prepare data for plot
cor.hf183.rsd.results$WWTPs<-paste0(cor.hf183.rsd.results$City, " (", round(cor.hf183.rsd.results$n, digits = 2), ")")

#plot
ggplot(cor.hf183.rsd.results, aes(x=Flow.mean, y=HF183.rsd.mean, colour=WWTPs)) +
    geom_point() + 
  labs(x = "Averaged daily flow per WWTP - (MDG)", 
       y = "Averaged HF183 RSD per WWTP (%)") +
  theme_bw()

Power of the recovery controls

Between duplicate filters: difference between the recovery controls and N1

# Restrict to days where N1 was quantifiable on every filter, so N1_rsd_day
# is a meaningful day-level variability estimate.
data.per.day.lq<-subset(data.per.day, N1_LowestLevelDetection_day=="Quantifiable")
# Do the recovery controls' day-to-day RSDs track the N1 RSD? The rho values
# below (0.11 and 0.17) indicate only a weak association.
data.per.day.lq %>%
  summarize(rho_BCoV=cor(N1_rsd_day, BCoV_rsd_day, method = "spearman", use="complete.obs"), 
            n_BCoV = sum(!is.na(N1_rsd_day) & !is.na(BCoV_rsd_day)),
            
            rho_PMMoV=cor(N1_rsd_day, PMMoV_rsd_day, method = "spearman", use="complete.obs"), 
            n_PMMoV = sum(!is.na(N1_rsd_day) & !is.na(PMMoV_rsd_day)))
##    rho_BCoV n_BCoV rho_PMMoV n_PMMoV
## 1 0.1131425     62 0.1733766      55
# And do the two controls track each other? (all days, not just quantifiable)
data.per.day %>%
  summarize(rho_PMMoV_BCoV=cor(BCoV_rsd_day, PMMoV_rsd_day, method = "spearman", use="complete.obs"), 
            n_PMMoV_BCoV = sum(!is.na(PMMoV_rsd_day) & !is.na(BCoV_rsd_day)))
##   rho_PMMoV_BCoV n_PMMoV_BCoV
## 1       0.354479           86